/*
 * Copyright (c) 2021, Peter Abeles. All Rights Reserved.
 *
 * This file is part of BoofCV (http://boofcv.org).
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package boofcv.alg.filter.convolve.normalized;

import boofcv.struct.convolve.*;
import boofcv.struct.image.*;

import javax.annotation.Generated;
import java.util.Arrays;

/**
 * <p>
 * Convolves a 1D kernel in the horizontal or vertical direction across an image's border only, while re-normalizing the
 * kernel sum to one. The kernel MUST be smaller than the image.
 * </p>
 * 
 * <p>
 * NOTE: Do not modify. Automatically generated by GenerateConvolveNormalized_JustBorder_IL
 * </p>
 * 
 * @author Peter Abeles
 */
@Generated({"boofcv.alg.filter.convolve.normalized.GenerateConvolveNormalized_JustBorder_IL"})
@SuppressWarnings({"ForLoopReplaceableByForEach"})
public class ConvolveNormalized_JustBorder_IL {

	/**
	 * Applies a 1D kernel along each row, but only at the pixels next to the left and right image edges where
	 * the kernel does not fit inside the image. For each such pixel the weighted sum is computed from the part
	 * of the kernel which overlaps the row and is then divided by the sum of those same kernel elements.
	 * Pixels further away from the edges are not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many columns are processed per side.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void horizontal(Kernel1D_F32 kernel, InterleavedF32 src, InterleavedF32 dst ) {
		final float[] pixelsIn = src.data;
		final float[] pixelsOut = dst.data;

		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int radiusR = taps - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final int bands = src.getNumBands();
		final float[] sums = new float[bands];

		for (int row = 0; row < rows; row++) {
			final int rowSrc = src.startIndex + row*src.stride;
			final int rowDst = dst.startIndex + row*dst.stride;

			// Left edge: kernel elements before (radiusL - x) would sample columns left of the image
			for (int x = 0; x < radiusL; x++) {
				Arrays.fill(sums, 0);
				float norm = 0;
				int readAt = rowSrc;
				for (int tap = radiusL - x; tap < taps; tap++) {
					final float coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += pixelsIn[readAt]*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = sums[b]/norm;
				}
			}

			// Right edge: the kernel is cut off after the element which lands on the last column
			for (int x = cols - radiusR; x < cols; x++) {
				final int lastTap = radiusL + (cols - 1 - x);
				Arrays.fill(sums, 0);
				float norm = 0;
				int readAt = rowSrc + (x - radiusL)*bands;
				for (int tap = 0; tap <= lastTap; tap++) {
					final float coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += pixelsIn[readAt]*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = sums[b]/norm;
				}
			}
		}
	}

	/**
	 * Applies a 1D kernel along each column, but only for the rows next to the top and bottom image edges
	 * where the kernel does not fit inside the image. The weighted sum uses just the kernel elements which
	 * overlap the image and is divided by the sum of those elements. Rows further away from the edges are
	 * not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many rows are processed per side.
	 * @param input Image which is read from.
	 * @param output Image which the border rows are written to. Its width, height, and number of bands
	 * define the region which is processed.
	 */
	public static void vertical(Kernel1D_F32 kernel, InterleavedF32 input, InterleavedF32 output ) {
		final float[] pixelsIn = input.data;
		final float[] pixelsOut = output.data;

		final int taps = kernel.getWidth();
		final int radiusTop = kernel.getOffset();
		final int radiusBottom = taps - radiusTop - 1;

		final int rows = output.getHeight();
		final int bands = output.getNumBands();
		final float[] sums = new float[bands];

		// number of array elements which make up one image row
		final int rowLength = output.getWidth()*bands;

		// Top border: kernel elements before firstTap would sample rows above the image
		for (int y = 0; y < radiusTop; y++) {
			final int firstTap = radiusTop - y;

			// every pixel in this row is normalized by the same value
			float norm = 0;
			for (int tap = firstTap; tap < taps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				// sampling always begins in the first image row
				int readAt = input.startIndex + offset;
				for (int tap = firstTap; tap < taps; tap++) {
					final float coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += pixelsIn[readAt + b]*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = sums[b]/norm;
				}
			}
		}

		// Bottom border: only the first usedTaps kernel elements land on rows inside the image
		for (int y = rows - radiusBottom; y < rows; y++) {
			final int usedTaps = rows - (y - radiusTop);

			float norm = 0;
			for (int tap = 0; tap < usedTaps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			final int rowSrc = input.startIndex + (y - radiusTop)*input.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				int readAt = rowSrc + offset;
				for (int tap = 0; tap < usedTaps; tap++) {
					final float coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += pixelsIn[readAt + b]*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = sums[b]/norm;
				}
			}
		}
	}

	/**
	 * Applies a 2D kernel to the pixels along all four image borders, where the kernel extends outside of
	 * the image. Only kernel elements which land inside the image contribute, and the weighted sum is divided
	 * by the total of those elements. The left and right columns are processed for every row first, which
	 * covers the corners, followed by the remaining parts of the top and bottom rows.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how thick the processed border is.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void convolve(Kernel2D_F32 kernel, InterleavedF32 src, InterleavedF32 dst ) {
		final int radiusL = kernel.getOffset();
		final int radiusR = kernel.getWidth() - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final float[] sums = new float[dst.getNumBands()];

		// Left and right columns for all rows. The row range of the kernel is clipped at the top and bottom.
		for (int y = 0; y < rows; y++) {
			final int iLo = y >= radiusL ? -radiusL : -y;
			final int iHi = y < rows - radiusR ? radiusR : rows - y - 1;

			for (int x = 0; x < radiusL; x++) {
				borderPixelF32(kernel, src, dst, sums, x, y, iLo, iHi, -x, radiusR);
			}
			for (int x = cols - radiusR; x < cols; x++) {
				borderPixelF32(kernel, src, dst, sums, x, y, iLo, iHi, -radiusL, cols - x - 1);
			}
		}

		// Top rows, skipping the columns which were already handled above
		for (int y = 0; y < radiusL; y++) {
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelF32(kernel, src, dst, sums, x, y, -y, radiusR, -radiusL, radiusR);
			}
		}

		// Bottom rows, skipping the columns which were already handled above
		for (int y = rows - radiusR; y < rows; y++) {
			final int iHi = rows - y - 1;
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelF32(kernel, src, dst, sums, x, y, -radiusL, iHi, -radiusL, radiusR);
			}
		}
	}

	/**
	 * Computes one normalized output pixel at (x,y) using kernel rows iLo..iHi and kernel columns jLo..jHi,
	 * both inclusive and relative to the kernel's offset, then writes all of its bands into dst.
	 *
	 * @param sums Work space with one element per band. Overwritten.
	 */
	private static void borderPixelF32(Kernel2D_F32 kernel, InterleavedF32 src, InterleavedF32 dst, float[] sums,
									   int x, int y, int iLo, int iHi, int jLo, int jHi ) {
		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int bands = sums.length;
		final float[] pixelsIn = src.data;
		final float[] pixelsOut = dst.data;

		Arrays.fill(sums, 0);
		float norm = 0;

		for (int i = iLo; i <= iHi; i++) {
			int readAt = src.startIndex + (y + i)*src.stride + (x + jLo)*bands;
			final int kernelRow = (i + radiusL)*taps;

			for (int j = jLo; j <= jHi; j++) {
				final float coef = kernel.data[kernelRow + j + radiusL];
				norm += coef;
				for (int b = 0; b < bands; b++) {
					sums[b] += pixelsIn[readAt++]*coef;
				}
			}
		}

		int writeAt = dst.startIndex + y*dst.stride + x*bands;
		for (int b = 0; b < bands; b++) {
			pixelsOut[writeAt++] = sums[b]/norm;
		}
	}

	/**
	 * Applies a 1D kernel along each row, but only at the pixels next to the left and right image edges where
	 * the kernel does not fit inside the image. For each such pixel the weighted sum is computed from the part
	 * of the kernel which overlaps the row and is then divided by the sum of those same kernel elements.
	 * Pixels further away from the edges are not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many columns are processed per side.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void horizontal(Kernel1D_F64 kernel, InterleavedF64 src, InterleavedF64 dst ) {
		final double[] pixelsIn = src.data;
		final double[] pixelsOut = dst.data;

		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int radiusR = taps - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final int bands = src.getNumBands();
		final double[] sums = new double[bands];

		for (int row = 0; row < rows; row++) {
			final int rowSrc = src.startIndex + row*src.stride;
			final int rowDst = dst.startIndex + row*dst.stride;

			// Left edge: kernel elements before (radiusL - x) would sample columns left of the image
			for (int x = 0; x < radiusL; x++) {
				Arrays.fill(sums, 0);
				double norm = 0;
				int readAt = rowSrc;
				for (int tap = radiusL - x; tap < taps; tap++) {
					final double coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += pixelsIn[readAt]*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = sums[b]/norm;
				}
			}

			// Right edge: the kernel is cut off after the element which lands on the last column
			for (int x = cols - radiusR; x < cols; x++) {
				final int lastTap = radiusL + (cols - 1 - x);
				Arrays.fill(sums, 0);
				double norm = 0;
				int readAt = rowSrc + (x - radiusL)*bands;
				for (int tap = 0; tap <= lastTap; tap++) {
					final double coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += pixelsIn[readAt]*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = sums[b]/norm;
				}
			}
		}
	}

	/**
	 * Applies a 1D kernel along each column, but only for the rows next to the top and bottom image edges
	 * where the kernel does not fit inside the image. The weighted sum uses just the kernel elements which
	 * overlap the image and is divided by the sum of those elements. Rows further away from the edges are
	 * not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many rows are processed per side.
	 * @param input Image which is read from.
	 * @param output Image which the border rows are written to. Its width, height, and number of bands
	 * define the region which is processed.
	 */
	public static void vertical(Kernel1D_F64 kernel, InterleavedF64 input, InterleavedF64 output ) {
		final double[] pixelsIn = input.data;
		final double[] pixelsOut = output.data;

		final int taps = kernel.getWidth();
		final int radiusTop = kernel.getOffset();
		final int radiusBottom = taps - radiusTop - 1;

		final int rows = output.getHeight();
		final int bands = output.getNumBands();
		final double[] sums = new double[bands];

		// number of array elements which make up one image row
		final int rowLength = output.getWidth()*bands;

		// Top border: kernel elements before firstTap would sample rows above the image
		for (int y = 0; y < radiusTop; y++) {
			final int firstTap = radiusTop - y;

			// every pixel in this row is normalized by the same value
			double norm = 0;
			for (int tap = firstTap; tap < taps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				// sampling always begins in the first image row
				int readAt = input.startIndex + offset;
				for (int tap = firstTap; tap < taps; tap++) {
					final double coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += pixelsIn[readAt + b]*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = sums[b]/norm;
				}
			}
		}

		// Bottom border: only the first usedTaps kernel elements land on rows inside the image
		for (int y = rows - radiusBottom; y < rows; y++) {
			final int usedTaps = rows - (y - radiusTop);

			double norm = 0;
			for (int tap = 0; tap < usedTaps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			final int rowSrc = input.startIndex + (y - radiusTop)*input.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				int readAt = rowSrc + offset;
				for (int tap = 0; tap < usedTaps; tap++) {
					final double coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += pixelsIn[readAt + b]*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = sums[b]/norm;
				}
			}
		}
	}

	/**
	 * Applies a 2D kernel to the pixels along all four image borders, where the kernel extends outside of
	 * the image. Only kernel elements which land inside the image contribute, and the weighted sum is divided
	 * by the total of those elements. The left and right columns are processed for every row first, which
	 * covers the corners, followed by the remaining parts of the top and bottom rows.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how thick the processed border is.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void convolve(Kernel2D_F64 kernel, InterleavedF64 src, InterleavedF64 dst ) {
		final int radiusL = kernel.getOffset();
		final int radiusR = kernel.getWidth() - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final double[] sums = new double[dst.getNumBands()];

		// Left and right columns for all rows. The row range of the kernel is clipped at the top and bottom.
		for (int y = 0; y < rows; y++) {
			final int iLo = y >= radiusL ? -radiusL : -y;
			final int iHi = y < rows - radiusR ? radiusR : rows - y - 1;

			for (int x = 0; x < radiusL; x++) {
				borderPixelF64(kernel, src, dst, sums, x, y, iLo, iHi, -x, radiusR);
			}
			for (int x = cols - radiusR; x < cols; x++) {
				borderPixelF64(kernel, src, dst, sums, x, y, iLo, iHi, -radiusL, cols - x - 1);
			}
		}

		// Top rows, skipping the columns which were already handled above
		for (int y = 0; y < radiusL; y++) {
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelF64(kernel, src, dst, sums, x, y, -y, radiusR, -radiusL, radiusR);
			}
		}

		// Bottom rows, skipping the columns which were already handled above
		for (int y = rows - radiusR; y < rows; y++) {
			final int iHi = rows - y - 1;
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelF64(kernel, src, dst, sums, x, y, -radiusL, iHi, -radiusL, radiusR);
			}
		}
	}

	/**
	 * Computes one normalized output pixel at (x,y) using kernel rows iLo..iHi and kernel columns jLo..jHi,
	 * both inclusive and relative to the kernel's offset, then writes all of its bands into dst.
	 *
	 * @param sums Work space with one element per band. Overwritten.
	 */
	private static void borderPixelF64(Kernel2D_F64 kernel, InterleavedF64 src, InterleavedF64 dst, double[] sums,
									   int x, int y, int iLo, int iHi, int jLo, int jHi ) {
		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int bands = sums.length;
		final double[] pixelsIn = src.data;
		final double[] pixelsOut = dst.data;

		Arrays.fill(sums, 0);
		double norm = 0;

		for (int i = iLo; i <= iHi; i++) {
			int readAt = src.startIndex + (y + i)*src.stride + (x + jLo)*bands;
			final int kernelRow = (i + radiusL)*taps;

			for (int j = jLo; j <= jHi; j++) {
				final double coef = kernel.data[kernelRow + j + radiusL];
				norm += coef;
				for (int b = 0; b < bands; b++) {
					sums[b] += pixelsIn[readAt++]*coef;
				}
			}
		}

		int writeAt = dst.startIndex + y*dst.stride + x*bands;
		for (int b = 0; b < bands; b++) {
			pixelsOut[writeAt++] = sums[b]/norm;
		}
	}

	/**
	 * Applies a 1D integer kernel along each row, but only at the pixels next to the left and right image
	 * edges where the kernel does not fit inside the image. Source bytes are read as unsigned values. The
	 * weighted sum over the overlapping part of the kernel has half of the kernel sum added to it, is divided
	 * by the kernel sum using integer division, and is then cast to byte. Pixels further away from the edges
	 * are not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many columns are processed per side.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void horizontal(Kernel1D_S32 kernel, InterleavedU8 src, InterleavedI8 dst ) {
		final byte[] pixelsIn = src.data;
		final byte[] pixelsOut = dst.data;

		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int radiusR = taps - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final int bands = src.getNumBands();
		final int[] sums = new int[bands];

		for (int row = 0; row < rows; row++) {
			final int rowSrc = src.startIndex + row*src.stride;
			final int rowDst = dst.startIndex + row*dst.stride;

			// Left edge: kernel elements before (radiusL - x) would sample columns left of the image
			for (int x = 0; x < radiusL; x++) {
				Arrays.fill(sums, 0);
				int norm = 0;
				int readAt = rowSrc;
				for (int tap = radiusL - x; tap < taps; tap++) {
					final int coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += (pixelsIn[readAt] & 0xFF)*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = (byte)((sums[b] + norm/2)/norm);
				}
			}

			// Right edge: the kernel is cut off after the element which lands on the last column
			for (int x = cols - radiusR; x < cols; x++) {
				final int lastTap = radiusL + (cols - 1 - x);
				Arrays.fill(sums, 0);
				int norm = 0;
				int readAt = rowSrc + (x - radiusL)*bands;
				for (int tap = 0; tap <= lastTap; tap++) {
					final int coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += (pixelsIn[readAt] & 0xFF)*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = (byte)((sums[b] + norm/2)/norm);
				}
			}
		}
	}

	/**
	 * Applies a 1D integer kernel along each column, but only for the rows next to the top and bottom image
	 * edges where the kernel does not fit inside the image. Source bytes are read as unsigned values. The
	 * weighted sum over the overlapping part of the kernel has half of the kernel sum added to it, is divided
	 * by the kernel sum using integer division, and is then cast to byte. Rows further away from the edges
	 * are not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many rows are processed per side.
	 * @param input Image which is read from.
	 * @param output Image which the border rows are written to. Its width, height, and number of bands
	 * define the region which is processed.
	 */
	public static void vertical(Kernel1D_S32 kernel, InterleavedU8 input, InterleavedI8 output ) {
		final byte[] pixelsIn = input.data;
		final byte[] pixelsOut = output.data;

		final int taps = kernel.getWidth();
		final int radiusTop = kernel.getOffset();
		final int radiusBottom = taps - radiusTop - 1;

		final int rows = output.getHeight();
		final int bands = output.getNumBands();
		final int[] sums = new int[bands];

		// number of array elements which make up one image row
		final int rowLength = output.getWidth()*bands;

		// Top border: kernel elements before firstTap would sample rows above the image
		for (int y = 0; y < radiusTop; y++) {
			final int firstTap = radiusTop - y;

			// every pixel in this row is normalized by the same value
			int norm = 0;
			for (int tap = firstTap; tap < taps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				// sampling always begins in the first image row
				int readAt = input.startIndex + offset;
				for (int tap = firstTap; tap < taps; tap++) {
					final int coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += (pixelsIn[readAt + b] & 0xFF)*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = (byte)((sums[b] + norm/2)/norm);
				}
			}
		}

		// Bottom border: only the first usedTaps kernel elements land on rows inside the image
		for (int y = rows - radiusBottom; y < rows; y++) {
			final int usedTaps = rows - (y - radiusTop);

			int norm = 0;
			for (int tap = 0; tap < usedTaps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			final int rowSrc = input.startIndex + (y - radiusTop)*input.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				int readAt = rowSrc + offset;
				for (int tap = 0; tap < usedTaps; tap++) {
					final int coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += (pixelsIn[readAt + b] & 0xFF)*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = (byte)((sums[b] + norm/2)/norm);
				}
			}
		}
	}

	/**
	 * Applies a 2D integer kernel to the pixels along all four image borders, where the kernel extends
	 * outside of the image. Source bytes are read as unsigned values and only kernel elements which land
	 * inside the image contribute. The weighted sum has half of the used kernel sum added to it, is divided
	 * by that kernel sum using integer division, and is then cast to byte. The left and right columns are
	 * processed for every row first, which covers the corners, followed by the remaining parts of the top
	 * and bottom rows.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how thick the processed border is.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void convolve(Kernel2D_S32 kernel, InterleavedU8 src, InterleavedI8 dst ) {
		final int radiusL = kernel.getOffset();
		final int radiusR = kernel.getWidth() - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final int[] sums = new int[dst.getNumBands()];

		// Left and right columns for all rows. The row range of the kernel is clipped at the top and bottom.
		for (int y = 0; y < rows; y++) {
			final int iLo = y >= radiusL ? -radiusL : -y;
			final int iHi = y < rows - radiusR ? radiusR : rows - y - 1;

			for (int x = 0; x < radiusL; x++) {
				borderPixelU8(kernel, src, dst, sums, x, y, iLo, iHi, -x, radiusR);
			}
			for (int x = cols - radiusR; x < cols; x++) {
				borderPixelU8(kernel, src, dst, sums, x, y, iLo, iHi, -radiusL, cols - x - 1);
			}
		}

		// Top rows, skipping the columns which were already handled above
		for (int y = 0; y < radiusL; y++) {
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelU8(kernel, src, dst, sums, x, y, -y, radiusR, -radiusL, radiusR);
			}
		}

		// Bottom rows, skipping the columns which were already handled above
		for (int y = rows - radiusR; y < rows; y++) {
			final int iHi = rows - y - 1;
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelU8(kernel, src, dst, sums, x, y, -radiusL, iHi, -radiusL, radiusR);
			}
		}
	}

	/**
	 * Computes one normalized output pixel at (x,y) using kernel rows iLo..iHi and kernel columns jLo..jHi,
	 * both inclusive and relative to the kernel's offset, then writes all of its bands into dst.
	 *
	 * @param sums Work space with one element per band. Overwritten.
	 */
	private static void borderPixelU8(Kernel2D_S32 kernel, InterleavedU8 src, InterleavedI8 dst, int[] sums,
									  int x, int y, int iLo, int iHi, int jLo, int jHi ) {
		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int bands = sums.length;
		final byte[] pixelsIn = src.data;
		final byte[] pixelsOut = dst.data;

		Arrays.fill(sums, 0);
		int norm = 0;

		for (int i = iLo; i <= iHi; i++) {
			int readAt = src.startIndex + (y + i)*src.stride + (x + jLo)*bands;
			final int kernelRow = (i + radiusL)*taps;

			for (int j = jLo; j <= jHi; j++) {
				final int coef = kernel.data[kernelRow + j + radiusL];
				norm += coef;
				for (int b = 0; b < bands; b++) {
					sums[b] += (pixelsIn[readAt++] & 0xFF)*coef;
				}
			}
		}

		int writeAt = dst.startIndex + y*dst.stride + x*bands;
		for (int b = 0; b < bands; b++) {
			pixelsOut[writeAt++] = (byte)((sums[b] + norm/2)/norm);
		}
	}

	/**
	 * Applies a 1D integer kernel along each row, but only at the pixels next to the left and right image
	 * edges where the kernel does not fit inside the image. Source values are read as signed shorts. The
	 * weighted sum over the overlapping part of the kernel has half of the kernel sum added to it, is divided
	 * by the kernel sum using integer division, and is then cast to short. Pixels further away from the
	 * edges are not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many columns are processed per side.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void horizontal(Kernel1D_S32 kernel, InterleavedS16 src, InterleavedI16 dst ) {
		final short[] pixelsIn = src.data;
		final short[] pixelsOut = dst.data;

		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int radiusR = taps - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final int bands = src.getNumBands();
		final int[] sums = new int[bands];

		for (int row = 0; row < rows; row++) {
			final int rowSrc = src.startIndex + row*src.stride;
			final int rowDst = dst.startIndex + row*dst.stride;

			// Left edge: kernel elements before (radiusL - x) would sample columns left of the image
			for (int x = 0; x < radiusL; x++) {
				Arrays.fill(sums, 0);
				int norm = 0;
				int readAt = rowSrc;
				for (int tap = radiusL - x; tap < taps; tap++) {
					final int coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += pixelsIn[readAt]*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = (short)((sums[b] + norm/2)/norm);
				}
			}

			// Right edge: the kernel is cut off after the element which lands on the last column
			for (int x = cols - radiusR; x < cols; x++) {
				final int lastTap = radiusL + (cols - 1 - x);
				Arrays.fill(sums, 0);
				int norm = 0;
				int readAt = rowSrc + (x - radiusL)*bands;
				for (int tap = 0; tap <= lastTap; tap++) {
					final int coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += pixelsIn[readAt]*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = (short)((sums[b] + norm/2)/norm);
				}
			}
		}
	}

	/**
	 * Applies a 1D integer kernel along each column, but only for the rows next to the top and bottom image
	 * edges where the kernel does not fit inside the image. Source values are read as signed shorts. The
	 * weighted sum over the overlapping part of the kernel has half of the kernel sum added to it, is divided
	 * by the kernel sum using integer division, and is then cast to short. Rows further away from the edges
	 * are not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many rows are processed per side.
	 * @param input Image which is read from.
	 * @param output Image which the border rows are written to. Its width, height, and number of bands
	 * define the region which is processed.
	 */
	public static void vertical(Kernel1D_S32 kernel, InterleavedS16 input, InterleavedI16 output ) {
		final short[] pixelsIn = input.data;
		final short[] pixelsOut = output.data;

		final int taps = kernel.getWidth();
		final int radiusTop = kernel.getOffset();
		final int radiusBottom = taps - radiusTop - 1;

		final int rows = output.getHeight();
		final int bands = output.getNumBands();
		final int[] sums = new int[bands];

		// number of array elements which make up one image row
		final int rowLength = output.getWidth()*bands;

		// Top border: kernel elements before firstTap would sample rows above the image
		for (int y = 0; y < radiusTop; y++) {
			final int firstTap = radiusTop - y;

			// every pixel in this row is normalized by the same value
			int norm = 0;
			for (int tap = firstTap; tap < taps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				// sampling always begins in the first image row
				int readAt = input.startIndex + offset;
				for (int tap = firstTap; tap < taps; tap++) {
					final int coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += pixelsIn[readAt + b]*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = (short)((sums[b] + norm/2)/norm);
				}
			}
		}

		// Bottom border: only the first usedTaps kernel elements land on rows inside the image
		for (int y = rows - radiusBottom; y < rows; y++) {
			final int usedTaps = rows - (y - radiusTop);

			int norm = 0;
			for (int tap = 0; tap < usedTaps; tap++) {
				norm += kernel.data[tap];
			}

			final int rowDst = output.startIndex + y*output.stride;
			final int rowSrc = input.startIndex + (y - radiusTop)*input.stride;
			for (int offset = 0; offset < rowLength; offset += bands) {
				Arrays.fill(sums, 0);
				int readAt = rowSrc + offset;
				for (int tap = 0; tap < usedTaps; tap++) {
					final int coef = kernel.data[tap];
					for (int b = 0; b < bands; b++) {
						sums[b] += pixelsIn[readAt + b]*coef;
					}
					readAt += input.stride;
				}
				for (int b = 0; b < bands; b++) {
					pixelsOut[rowDst + offset + b] = (short)((sums[b] + norm/2)/norm);
				}
			}
		}
	}

	/**
	 * Applies a 2D integer kernel to the pixels along all four image borders, where the kernel extends
	 * outside of the image. Source values are read as signed shorts and only kernel elements which land
	 * inside the image contribute. The weighted sum has half of the used kernel sum added to it, is divided
	 * by that kernel sum using integer division, and is then cast to short. The left and right columns are
	 * processed for every row first, which covers the corners, followed by the remaining parts of the top
	 * and bottom rows.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how thick the processed border is.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void convolve(Kernel2D_S32 kernel, InterleavedS16 src, InterleavedI16 dst ) {
		final int radiusL = kernel.getOffset();
		final int radiusR = kernel.getWidth() - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final int[] sums = new int[dst.getNumBands()];

		// Left and right columns for all rows. The row range of the kernel is clipped at the top and bottom.
		for (int y = 0; y < rows; y++) {
			final int iLo = y >= radiusL ? -radiusL : -y;
			final int iHi = y < rows - radiusR ? radiusR : rows - y - 1;

			for (int x = 0; x < radiusL; x++) {
				borderPixelS16(kernel, src, dst, sums, x, y, iLo, iHi, -x, radiusR);
			}
			for (int x = cols - radiusR; x < cols; x++) {
				borderPixelS16(kernel, src, dst, sums, x, y, iLo, iHi, -radiusL, cols - x - 1);
			}
		}

		// Top rows, skipping the columns which were already handled above
		for (int y = 0; y < radiusL; y++) {
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelS16(kernel, src, dst, sums, x, y, -y, radiusR, -radiusL, radiusR);
			}
		}

		// Bottom rows, skipping the columns which were already handled above
		for (int y = rows - radiusR; y < rows; y++) {
			final int iHi = rows - y - 1;
			for (int x = radiusL; x < cols - radiusR; x++) {
				borderPixelS16(kernel, src, dst, sums, x, y, -radiusL, iHi, -radiusL, radiusR);
			}
		}
	}

	/**
	 * Computes one normalized output pixel at (x,y) using kernel rows iLo..iHi and kernel columns jLo..jHi,
	 * both inclusive and relative to the kernel's offset, then writes all of its bands into dst.
	 *
	 * @param sums Work space with one element per band. Overwritten.
	 */
	private static void borderPixelS16(Kernel2D_S32 kernel, InterleavedS16 src, InterleavedI16 dst, int[] sums,
									   int x, int y, int iLo, int iHi, int jLo, int jHi ) {
		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int bands = sums.length;
		final short[] pixelsIn = src.data;
		final short[] pixelsOut = dst.data;

		Arrays.fill(sums, 0);
		int norm = 0;

		for (int i = iLo; i <= iHi; i++) {
			int readAt = src.startIndex + (y + i)*src.stride + (x + jLo)*bands;
			final int kernelRow = (i + radiusL)*taps;

			for (int j = jLo; j <= jHi; j++) {
				final int coef = kernel.data[kernelRow + j + radiusL];
				norm += coef;
				for (int b = 0; b < bands; b++) {
					sums[b] += pixelsIn[readAt++]*coef;
				}
			}
		}

		int writeAt = dst.startIndex + y*dst.stride + x*bands;
		for (int b = 0; b < bands; b++) {
			pixelsOut[writeAt++] = (short)((sums[b] + norm/2)/norm);
		}
	}

	/**
	 * Applies a 1D integer kernel along each row, but only at the pixels next to the left and right image
	 * edges where the kernel does not fit inside the image. Source shorts are read as unsigned values. The
	 * weighted sum over the overlapping part of the kernel has half of the kernel sum added to it, is divided
	 * by the kernel sum using integer division, and is then cast to short. Pixels further away from the
	 * edges are not written.
	 *
	 * @param kernel Kernel being applied. Its offset and width decide how many columns are processed per side.
	 * @param src Image which is read from.
	 * @param dst Image which the border pixels are written to.
	 */
	public static void horizontal(Kernel1D_S32 kernel, InterleavedU16 src, InterleavedI16 dst ) {
		final short[] pixelsIn = src.data;
		final short[] pixelsOut = dst.data;

		final int taps = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int radiusR = taps - radiusL - 1;

		final int cols = src.getWidth();
		final int rows = src.getHeight();
		final int bands = src.getNumBands();
		final int[] sums = new int[bands];

		for (int row = 0; row < rows; row++) {
			final int rowSrc = src.startIndex + row*src.stride;
			final int rowDst = dst.startIndex + row*dst.stride;

			// Left edge: kernel elements before (radiusL - x) would sample columns left of the image
			for (int x = 0; x < radiusL; x++) {
				Arrays.fill(sums, 0);
				int norm = 0;
				int readAt = rowSrc;
				for (int tap = radiusL - x; tap < taps; tap++) {
					final int coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += (pixelsIn[readAt] & 0xFFFF)*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = (short)((sums[b] + norm/2)/norm);
				}
			}

			// Right edge: the kernel is cut off after the element which lands on the last column
			for (int x = cols - radiusR; x < cols; x++) {
				final int lastTap = radiusL + (cols - 1 - x);
				Arrays.fill(sums, 0);
				int norm = 0;
				int readAt = rowSrc + (x - radiusL)*bands;
				for (int tap = 0; tap <= lastTap; tap++) {
					final int coef = kernel.data[tap];
					norm += coef;
					for (int b = 0; b < bands; b++, readAt++) {
						sums[b] += (pixelsIn[readAt] & 0xFFFF)*coef;
					}
				}
				final int writeAt = rowDst + x*bands;
				for (int b = 0; b < bands; b++) {
					pixelsOut[writeAt + b] = (short)((sums[b] + norm/2)/norm);
				}
			}
		}
	}

	/**
	 * Convolves a 1D kernel along image columns, but only writes the rows in the top and bottom border, i.e. the
	 * first kernel.getOffset() rows and the last (kernelWidth - offset - 1) rows. For each border row only the
	 * kernel elements which overlap the image are applied. The weighted sum is divided by the sum of those
	 * elements, with half of that sum added first so the integer division rounds.
	 *
	 * @param kernel Kernel which is applied vertically.
	 * @param input Input image. Its samples are read as unsigned 16-bit values.
	 * @param output Output image. Only its border rows are written to. The image size is taken from here.
	 */
	public static void vertical(Kernel1D_S32 kernel, InterleavedU16 input, InterleavedI16 output ) {
		final short[] pixelsIn = input.data;
		final short[] pixelsOut = output.data;

		final int kernelWidth = kernel.getWidth();
		final int radiusTop = kernel.getOffset();
		final int radiusBottom = kernelWidth - radiusTop - 1;

		final int imgWidth = output.getWidth();
		final int imgHeight = output.getHeight();
		final int numBands = output.getNumBands();
		final int[] sums = new int[numBands];

		final int rowLength = imgWidth*numBands;

		// Top border: kernel elements before index (radiusTop - y) would land above the image
		for (int y = 0; y < radiusTop; y++) {
			final int firstK = radiusTop - y;

			// every pixel in this row uses the same part of the kernel, so the weight is summed once per row
			int weightSum = 0;
			for (int k = firstK; k < kernelWidth; k++) {
				weightSum += kernel.data[k];
			}

			int out = output.startIndex + y*output.stride;
			int pixel = input.startIndex + y*input.stride;
			final int rowEnd = pixel + rowLength;

			while (pixel < rowEnd) {
				Arrays.fill(sums, 0);
				// begin at the same column in the image's first row and step down one row per kernel element
				int in = pixel - y*input.stride;
				for (int k = firstK; k < kernelWidth; k++) {
					final int w = kernel.data[k];
					for (int band = 0; band < numBands; band++) {
						sums[band] += (pixelsIn[in + band] & 0xFFFF)*w;
					}
					in += input.stride;
				}
				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (short)((sums[band] + weightSum/2)/weightSum);
				}
				pixel += numBands;
			}
		}

		// Bottom border: only the first numK kernel elements land inside the image
		for (int y = imgHeight - radiusBottom; y < imgHeight; y++) {
			final int numK = imgHeight - (y - radiusTop);

			int weightSum = 0;
			for (int k = 0; k < numK; k++) {
				weightSum += kernel.data[k];
			}

			int out = output.startIndex + y*output.stride;
			int pixel = input.startIndex + y*input.stride;
			final int rowEnd = pixel + rowLength;

			while (pixel < rowEnd) {
				Arrays.fill(sums, 0);
				// begin radiusTop rows above the current pixel
				int in = pixel - radiusTop*input.stride;
				for (int k = 0; k < numK; k++) {
					final int w = kernel.data[k];
					for (int band = 0; band < numBands; band++) {
						sums[band] += (pixelsIn[in + band] & 0xFFFF)*w;
					}
					in += input.stride;
				}
				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (short)((sums[band] + weightSum/2)/weightSum);
				}
				pixel += numBands;
			}
		}
	}

	/**
	 * Convolves a 2D kernel, but only writes pixels in the image border, where the kernel does not fit entirely
	 * inside the image. For each border pixel only the kernel elements which overlap the image are applied. The
	 * weighted sum is divided by the sum of those elements, with half of that sum added first so the integer
	 * division rounds.
	 *
	 * @param kernel Kernel which is applied. The same offset is used for rows and columns.
	 * @param src Input image. Its samples are read as unsigned 16-bit values.
	 * @param dst Output image. Only its border pixels are written to.
	 */
	public static void convolve(Kernel2D_S32 kernel, InterleavedU16 src, InterleavedI16 dst ) {
		final short[] pixelsIn = src.data;
		final short[] pixelsOut = dst.data;

		final int kernelWidth = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int radiusR = kernelWidth - radiusL - 1;

		final int width = src.getWidth();
		final int height = src.getHeight();
		final int numBands = dst.getNumBands();
		final int[] sums = new int[numBands];

		// Left and right borders of every row, which includes the four corners
		for (int y = 0; y < height; y++) {

			// clip the vertical extent of the kernel so that it stays inside the image
			final int firstDy = y >= radiusL ? -radiusL : -y;
			final int lastDy = y < height - radiusR ? radiusR : height - y - 1;

			int out = dst.startIndex + y*dst.stride;

			for (int x = 0; x < radiusL; x++) {
				// kernel columns which would land left of the image are skipped
				final int firstKx = radiusL - x;

				int weightSum = 0;
				Arrays.fill(sums, 0);

				for (int dy = firstDy; dy <= lastDy; dy++) {
					int in = src.startIndex + (y + dy)*src.stride;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = firstKx; kx < kernelWidth; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += (pixelsIn[in++] & 0xFFFF)*w;
						}
					}
				}

				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (short)((sums[band] + weightSum/2)/weightSum);
				}
			}

			out = dst.startIndex + y*dst.stride + (width - radiusR)*numBands;
			for (int x = width - radiusR; x < width; x++) {
				// last kernel column which still lands inside the image
				final int lastKx = radiusL + (width - x - 1);

				int weightSum = 0;
				Arrays.fill(sums, 0);

				for (int dy = firstDy; dy <= lastDy; dy++) {
					int in = src.startIndex + (y + dy)*src.stride + (x - radiusL)*numBands;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = 0; kx <= lastKx; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += (pixelsIn[in++] & 0xFFFF)*w;
						}
					}
				}

				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (short)((sums[band] + weightSum/2)/weightSum);
				}
			}
		}

		// Top border, excluding the corners since they were processed above
		for (int y = 0; y < radiusL; y++) {

			int out = dst.startIndex + y*dst.stride + radiusL*numBands;

			for (int x = radiusL; x < width - radiusR; x++) {

				int weightSum = 0;
				Arrays.fill(sums, 0);

				// kernel rows above the image are skipped, every kernel column is used
				for (int dy = -y; dy <= radiusR; dy++) {
					int in = src.startIndex + (y + dy)*src.stride + (x - radiusL)*numBands;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = 0; kx < kernelWidth; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += (pixelsIn[in++] & 0xFFFF)*w;
						}
					}
				}

				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (short)((sums[band] + weightSum/2)/weightSum);
				}
			}
		}

		// Bottom border, again excluding the corners
		for (int y = height - radiusR; y < height; y++) {

			// last kernel row offset which still lands inside the image
			final int lastDy = height - y - 1;
			int out = dst.startIndex + y*dst.stride + radiusL*numBands;

			for (int x = radiusL; x < width - radiusR; x++) {

				int weightSum = 0;
				Arrays.fill(sums, 0);

				for (int dy = -radiusL; dy <= lastDy; dy++) {
					int in = src.startIndex + (y + dy)*src.stride + (x - radiusL)*numBands;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = 0; kx < kernelWidth; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += (pixelsIn[in++] & 0xFFFF)*w;
						}
					}
				}
				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (short)((sums[band] + weightSum/2)/weightSum);
				}
			}
		}
	}

	/**
	 * Convolves a 1D kernel along image rows, but only writes the pixels in the left and right border, i.e. the
	 * first kernel.getOffset() columns and the last (kernelWidth - offset - 1) columns of every row. For each
	 * border pixel only the kernel elements which overlap the image are applied. The weighted sum is divided by
	 * the sum of those elements, with half of that sum added first so the integer division rounds.
	 *
	 * @param kernel Kernel which is applied horizontally.
	 * @param src Input image.
	 * @param dst Output image. Only its border columns are written to.
	 */
	public static void horizontal(Kernel1D_S32 kernel, InterleavedS32 src, InterleavedS32 dst ) {
		final int[] pixelsIn = src.data;
		final int[] pixelsOut = dst.data;

		final int kernelWidth = kernel.getWidth();
		final int radiusLeft = kernel.getOffset();
		final int radiusRight = kernelWidth - radiusLeft - 1;

		final int width = src.getWidth();
		final int height = src.getHeight();
		final int numBands = src.getNumBands();
		final int[] sums = new int[numBands];

		for (int row = 0; row < height; row++) {
			final int rowSrc = src.startIndex + row*src.stride;
			final int rowDst = dst.startIndex + row*dst.stride;

			// Left border. The kernel is clipped on its left side: the first usable element is radiusLeft for
			// column 0 and moves one closer to the start of the kernel with each following column.
			int out = rowDst;
			for (int firstK = radiusLeft; firstK > 0; firstK--) {
				int in = rowSrc;
				int weightSum = 0;
				Arrays.fill(sums, 0);
				for (int k = firstK; k < kernelWidth; k++) {
					final int w = kernel.data[k];
					weightSum += w;
					for (int band = 0; band < numBands; band++, in++) {
						sums[band] += pixelsIn[in]*w;
					}
				}
				for (int band = 0; band < numBands; band++, out++) {
					pixelsOut[out] = (sums[band] + weightSum/2)/weightSum;
				}
			}

			// Right border. The kernel is clipped on its right side. 'inside' is the number of pixels to the
			// right of the current column which are still in the image; it reaches zero at the last column.
			out = rowDst + (width - radiusRight)*numBands;
			for (int inside = radiusRight - 1; inside >= 0; inside--) {
				int in = rowSrc + (width - radiusLeft - inside - 1)*numBands;
				final int lastK = radiusLeft + inside;
				int weightSum = 0;
				Arrays.fill(sums, 0);
				for (int k = 0; k <= lastK; k++) {
					final int w = kernel.data[k];
					weightSum += w;
					for (int band = 0; band < numBands; band++, in++) {
						sums[band] += pixelsIn[in]*w;
					}
				}
				for (int band = 0; band < numBands; band++, out++) {
					pixelsOut[out] = (sums[band] + weightSum/2)/weightSum;
				}
			}
		}
	}

	/**
	 * Convolves a 1D kernel along image columns, but only writes the rows in the top and bottom border, i.e. the
	 * first kernel.getOffset() rows and the last (kernelWidth - offset - 1) rows. For each border row only the
	 * kernel elements which overlap the image are applied. The weighted sum is divided by the sum of those
	 * elements, with half of that sum added first so the integer division rounds.
	 *
	 * @param kernel Kernel which is applied vertically.
	 * @param input Input image.
	 * @param output Output image. Only its border rows are written to. The image size is taken from here.
	 */
	public static void vertical(Kernel1D_S32 kernel, InterleavedS32 input, InterleavedS32 output ) {
		final int[] pixelsIn = input.data;
		final int[] pixelsOut = output.data;

		final int kernelWidth = kernel.getWidth();
		final int radiusTop = kernel.getOffset();
		final int radiusBottom = kernelWidth - radiusTop - 1;

		final int imgWidth = output.getWidth();
		final int imgHeight = output.getHeight();
		final int numBands = output.getNumBands();
		final int[] sums = new int[numBands];

		final int rowLength = imgWidth*numBands;

		// Top border: kernel elements before index (radiusTop - y) would land above the image
		for (int y = 0; y < radiusTop; y++) {
			final int firstK = radiusTop - y;

			// every pixel in this row uses the same part of the kernel, so the weight is summed once per row
			int weightSum = 0;
			for (int k = firstK; k < kernelWidth; k++) {
				weightSum += kernel.data[k];
			}

			int out = output.startIndex + y*output.stride;
			int pixel = input.startIndex + y*input.stride;
			final int rowEnd = pixel + rowLength;

			while (pixel < rowEnd) {
				Arrays.fill(sums, 0);
				// begin at the same column in the image's first row and step down one row per kernel element
				int in = pixel - y*input.stride;
				for (int k = firstK; k < kernelWidth; k++) {
					final int w = kernel.data[k];
					for (int band = 0; band < numBands; band++) {
						sums[band] += pixelsIn[in + band]*w;
					}
					in += input.stride;
				}
				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (sums[band] + weightSum/2)/weightSum;
				}
				pixel += numBands;
			}
		}

		// Bottom border: only the first numK kernel elements land inside the image
		for (int y = imgHeight - radiusBottom; y < imgHeight; y++) {
			final int numK = imgHeight - (y - radiusTop);

			int weightSum = 0;
			for (int k = 0; k < numK; k++) {
				weightSum += kernel.data[k];
			}

			int out = output.startIndex + y*output.stride;
			int pixel = input.startIndex + y*input.stride;
			final int rowEnd = pixel + rowLength;

			while (pixel < rowEnd) {
				Arrays.fill(sums, 0);
				// begin radiusTop rows above the current pixel
				int in = pixel - radiusTop*input.stride;
				for (int k = 0; k < numK; k++) {
					final int w = kernel.data[k];
					for (int band = 0; band < numBands; band++) {
						sums[band] += pixelsIn[in + band]*w;
					}
					in += input.stride;
				}
				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (sums[band] + weightSum/2)/weightSum;
				}
				pixel += numBands;
			}
		}
	}

	/**
	 * Convolves a 2D kernel, but only writes pixels in the image border, where the kernel does not fit entirely
	 * inside the image. For each border pixel only the kernel elements which overlap the image are applied. The
	 * weighted sum is divided by the sum of those elements, with half of that sum added first so the integer
	 * division rounds.
	 *
	 * @param kernel Kernel which is applied. The same offset is used for rows and columns.
	 * @param src Input image.
	 * @param dst Output image. Only its border pixels are written to.
	 */
	public static void convolve(Kernel2D_S32 kernel, InterleavedS32 src, InterleavedS32 dst ) {
		final int[] pixelsIn = src.data;
		final int[] pixelsOut = dst.data;

		final int kernelWidth = kernel.getWidth();
		final int radiusL = kernel.getOffset();
		final int radiusR = kernelWidth - radiusL - 1;

		final int width = src.getWidth();
		final int height = src.getHeight();
		final int numBands = dst.getNumBands();
		final int[] sums = new int[numBands];

		// Left and right borders of every row, which includes the four corners
		for (int y = 0; y < height; y++) {

			// clip the vertical extent of the kernel so that it stays inside the image
			final int firstDy = y >= radiusL ? -radiusL : -y;
			final int lastDy = y < height - radiusR ? radiusR : height - y - 1;

			int out = dst.startIndex + y*dst.stride;

			for (int x = 0; x < radiusL; x++) {
				// kernel columns which would land left of the image are skipped
				final int firstKx = radiusL - x;

				int weightSum = 0;
				Arrays.fill(sums, 0);

				for (int dy = firstDy; dy <= lastDy; dy++) {
					int in = src.startIndex + (y + dy)*src.stride;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = firstKx; kx < kernelWidth; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += pixelsIn[in++]*w;
						}
					}
				}

				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (sums[band] + weightSum/2)/weightSum;
				}
			}

			out = dst.startIndex + y*dst.stride + (width - radiusR)*numBands;
			for (int x = width - radiusR; x < width; x++) {
				// last kernel column which still lands inside the image
				final int lastKx = radiusL + (width - x - 1);

				int weightSum = 0;
				Arrays.fill(sums, 0);

				for (int dy = firstDy; dy <= lastDy; dy++) {
					int in = src.startIndex + (y + dy)*src.stride + (x - radiusL)*numBands;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = 0; kx <= lastKx; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += pixelsIn[in++]*w;
						}
					}
				}

				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (sums[band] + weightSum/2)/weightSum;
				}
			}
		}

		// Top border, excluding the corners since they were processed above
		for (int y = 0; y < radiusL; y++) {

			int out = dst.startIndex + y*dst.stride + radiusL*numBands;

			for (int x = radiusL; x < width - radiusR; x++) {

				int weightSum = 0;
				Arrays.fill(sums, 0);

				// kernel rows above the image are skipped, every kernel column is used
				for (int dy = -y; dy <= radiusR; dy++) {
					int in = src.startIndex + (y + dy)*src.stride + (x - radiusL)*numBands;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = 0; kx < kernelWidth; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += pixelsIn[in++]*w;
						}
					}
				}

				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (sums[band] + weightSum/2)/weightSum;
				}
			}
		}

		// Bottom border, again excluding the corners
		for (int y = height - radiusR; y < height; y++) {

			// last kernel row offset which still lands inside the image
			final int lastDy = height - y - 1;
			int out = dst.startIndex + y*dst.stride + radiusL*numBands;

			for (int x = radiusL; x < width - radiusR; x++) {

				int weightSum = 0;
				Arrays.fill(sums, 0);

				for (int dy = -radiusL; dy <= lastDy; dy++) {
					int in = src.startIndex + (y + dy)*src.stride + (x - radiusL)*numBands;
					final int kernelRow = (dy + radiusL)*kernelWidth;

					for (int kx = 0; kx < kernelWidth; kx++) {
						final int w = kernel.data[kernelRow + kx];
						weightSum += w;
						for (int band = 0; band < numBands; band++) {
							sums[band] += pixelsIn[in++]*w;
						}
					}
				}
				for (int band = 0; band < numBands; band++) {
					pixelsOut[out++] = (sums[band] + weightSum/2)/weightSum;
				}
			}
		}
	}

	/**
	 * Applies kernelY along image columns for pixels in the image border and divides each result by the product
	 * of the kernelX and kernelY weights which overlap the image at that pixel. kernelX is never convolved with
	 * the image here; only sums of its elements are used in the divisor.
	 * NOTE(review): presumably src is the un-normalized output of a horizontal pass with kernelX - confirm
	 * against the caller.
	 *
	 * <p>Rows in the top and bottom border (as defined by kernelY) are processed across their full width. For the
	 * remaining rows only the first kernelX.getOffset() columns and the last
	 * (kernelX.getWidth() - kernelX.getOffset() - 1) columns are written.</p>
	 *
	 * @param kernelX Kernel whose partial sums provide the horizontal part of the divisor.
	 * @param kernelY Kernel which is convolved vertically.
	 * @param src Input image. Its samples are read as unsigned 16-bit values.
	 * @param dst Output image. Results are cast to byte. Only border pixels are written to.
	 */
	public static void vertical(Kernel1D_S32 kernelX, Kernel1D_S32 kernelY,
								InterleavedU16 src, InterleavedI8 dst ) {
		final short[] dataSrc = src.data;
		final byte[] dataDst = dst.data;

		final int offsetY = kernelY.getOffset();
		final int kernelWidthY = kernelY.getWidth();

		final int offsetX = kernelX.getOffset();
		final int kernelWidthX = kernelX.getWidth();
		final int offsetX1 = kernelWidthX-offsetX-1;

		final int imgWidth = dst.getWidth();
		final int imgHeight = dst.getHeight();
		final int numBands = dst.getNumBands();
		final int[] total = new int[ numBands ];

		final int yEnd = imgHeight - (kernelWidthY-offsetY-1);

		// horizontal weight at x=0, where only the kernel's center and the elements to its right are inside
		int startWeightX = 0;
		for (int k = offsetX; k < kernelWidthX; k++) {
			startWeightX += kernelX.data[k];
		}

		// top border, processed across the full width
		for (int y = 0; y < offsetY; y++) {
			int indexDst = dst.startIndex + y*dst.stride;
			int i = src.startIndex + y*src.stride;
			final int iEnd = i + imgWidth*numBands;

			// kernel elements before kStart would land above the image
			int kStart = offsetY - y;

			int weightY = 0;
			for (int k = kStart; k < kernelWidthY; k++) {
				weightY += kernelY.data[k];
			}
			int weightX = startWeightX;

			for ( int x = 0; i < iEnd; i += numBands, x++ ) {
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - y*src.stride;
				for (int k = kStart; k < kernelWidthY; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band]& 0xFFFF) * w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (byte)((total[band]+weight/2)/weight);
				}
				// update the horizontal weight for the next column: one more kernel element fits inside
				// near the left edge, one less near the right edge
				if( x < offsetX ) {
					weightX += kernelX.data[offsetX-x-1];
				} else if( x >= src.width-(kernelWidthX-offsetX) ) {
					weightX -= kernelX.data[src.width-x+offsetX-1];
				}
			}
		}

		// bottom border, processed across the full width
		for (int y = yEnd; y < imgHeight; y++) {
			int indexDst = dst.startIndex + y*dst.stride;
			int i = src.startIndex + y*src.stride;
			final int iEnd = i + imgWidth*numBands;

			// only the first kEnd kernel elements land inside the image
			int kEnd = imgHeight - (y - offsetY);

			int weightY = 0;
			for (int k = 0; k < kEnd; k++) {
				weightY += kernelY.data[k];
			}
			int weightX = startWeightX;

			for ( int x = 0; i < iEnd; i += numBands, x++ ) {
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - offsetY*src.stride;
				for (int k = 0; k < kEnd; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band]& 0xFFFF)*w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (byte)((total[band]+weight/2)/weight);
				}
				if( x < offsetX ) {
					weightX += kernelX.data[offsetX-x-1];
				} else if( x >= src.width-(kernelWidthX-offsetX) ) {
					weightX -= kernelX.data[src.width-x+offsetX-1];
				}
			}
		}

		// left and right border. The entire kernelY fits inside the image for these rows
		int weightY = kernelY.computeSum();
		for (int y = offsetY; y < yEnd; y++) {
			int indexDst = dst.startIndex + y*dst.stride;
			int i = src.startIndex + y*src.stride;

			// left side
			// The width of the left border is set by kernelX's offset. Using kernelY's offset here would index
			// kernelX.data out of bounds or leave columns unprocessed when the two offsets are different.
			int iEnd = i + offsetX*numBands;
			int weightX = startWeightX;
			for ( int x = 0; i < iEnd; i += numBands, x++ ) {
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - offsetY*src.stride;
				for (int k = 0; k < kernelWidthY; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band]& 0xFFFF) * w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (byte)((total[band]+weight/2)/weight);
				}
				weightX += kernelX.data[offsetX-x-1];
			}

			// right side
			// weightX is now the sum of all of kernelX since the loop above added every element left of center.
			// Each column drops the kernel element which just moved past the right edge.
			int startX = src.width-offsetX1;
			indexDst = dst.startIndex + y*dst.stride + startX*numBands;
			i = src.startIndex + y*src.stride + startX*numBands;
			iEnd = src.startIndex + y * src.stride + src.width*numBands;
			for ( int x = startX; i < iEnd; i += numBands, x++ ) {
				weightX -= kernelX.data[src.width-x+offsetX];
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - offsetY*src.stride;
				for (int k = 0; k < kernelWidthY; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band]& 0xFFFF)*w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (byte)((total[band]+weight/2)/weight);
				}
			}
		}
	}

	/**
	 * Applies kernelY along image columns for pixels in the image border and divides each result by the product
	 * of the kernelX and kernelY weights which overlap the image at that pixel. kernelX is never convolved with
	 * the image here; only sums of its elements are used in the divisor.
	 * NOTE(review): presumably src is the un-normalized output of a horizontal pass with kernelX - confirm
	 * against the caller.
	 *
	 * <p>Rows in the top and bottom border (as defined by kernelY) are processed across their full width. For the
	 * remaining rows only the first kernelX.getOffset() columns and the last
	 * (kernelX.getWidth() - kernelX.getOffset() - 1) columns are written.</p>
	 *
	 * @param kernelX Kernel whose partial sums provide the horizontal part of the divisor.
	 * @param kernelY Kernel which is convolved vertically.
	 * @param src Input image.
	 * @param dst Output image. Results are cast to short. Only border pixels are written to.
	 */
	public static void vertical(Kernel1D_S32 kernelX, Kernel1D_S32 kernelY,
								InterleavedS32 src, InterleavedI16 dst ) {
		final int[] dataSrc = src.data;
		final short[] dataDst = dst.data;

		final int offsetY = kernelY.getOffset();
		final int kernelWidthY = kernelY.getWidth();

		final int offsetX = kernelX.getOffset();
		final int kernelWidthX = kernelX.getWidth();
		final int offsetX1 = kernelWidthX-offsetX-1;

		final int imgWidth = dst.getWidth();
		final int imgHeight = dst.getHeight();
		final int numBands = dst.getNumBands();
		final int[] total = new int[ numBands ];

		final int yEnd = imgHeight - (kernelWidthY-offsetY-1);

		// horizontal weight at x=0, where only the kernel's center and the elements to its right are inside
		int startWeightX = 0;
		for (int k = offsetX; k < kernelWidthX; k++) {
			startWeightX += kernelX.data[k];
		}

		// top border, processed across the full width
		for (int y = 0; y < offsetY; y++) {
			int indexDst = dst.startIndex + y*dst.stride;
			int i = src.startIndex + y*src.stride;
			final int iEnd = i + imgWidth*numBands;

			// kernel elements before kStart would land above the image
			int kStart = offsetY - y;

			int weightY = 0;
			for (int k = kStart; k < kernelWidthY; k++) {
				weightY += kernelY.data[k];
			}
			int weightX = startWeightX;

			for ( int x = 0; i < iEnd; i += numBands, x++ ) {
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - y*src.stride;
				for (int k = kStart; k < kernelWidthY; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band]) * w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (short)((total[band]+weight/2)/weight);
				}
				// update the horizontal weight for the next column: one more kernel element fits inside
				// near the left edge, one less near the right edge
				if( x < offsetX ) {
					weightX += kernelX.data[offsetX-x-1];
				} else if( x >= src.width-(kernelWidthX-offsetX) ) {
					weightX -= kernelX.data[src.width-x+offsetX-1];
				}
			}
		}

		// bottom border, processed across the full width
		for (int y = yEnd; y < imgHeight; y++) {
			int indexDst = dst.startIndex + y*dst.stride;
			int i = src.startIndex + y*src.stride;
			final int iEnd = i + imgWidth*numBands;

			// only the first kEnd kernel elements land inside the image
			int kEnd = imgHeight - (y - offsetY);

			int weightY = 0;
			for (int k = 0; k < kEnd; k++) {
				weightY += kernelY.data[k];
			}
			int weightX = startWeightX;

			for ( int x = 0; i < iEnd; i += numBands, x++ ) {
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - offsetY*src.stride;
				for (int k = 0; k < kEnd; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band])*w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (short)((total[band]+weight/2)/weight);
				}
				if( x < offsetX ) {
					weightX += kernelX.data[offsetX-x-1];
				} else if( x >= src.width-(kernelWidthX-offsetX) ) {
					weightX -= kernelX.data[src.width-x+offsetX-1];
				}
			}
		}

		// left and right border. The entire kernelY fits inside the image for these rows
		int weightY = kernelY.computeSum();
		for (int y = offsetY; y < yEnd; y++) {
			int indexDst = dst.startIndex + y*dst.stride;
			int i = src.startIndex + y*src.stride;

			// left side
			// The width of the left border is set by kernelX's offset. Using kernelY's offset here would index
			// kernelX.data out of bounds or leave columns unprocessed when the two offsets are different.
			int iEnd = i + offsetX*numBands;
			int weightX = startWeightX;
			for ( int x = 0; i < iEnd; i += numBands, x++ ) {
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - offsetY*src.stride;
				for (int k = 0; k < kernelWidthY; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band]) * w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (short)((total[band]+weight/2)/weight);
				}
				weightX += kernelX.data[offsetX-x-1];
			}

			// right side
			// weightX is now the sum of all of kernelX since the loop above added every element left of center.
			// Each column drops the kernel element which just moved past the right edge.
			int startX = src.width-offsetX1;
			indexDst = dst.startIndex + y*dst.stride + startX*numBands;
			i = src.startIndex + y*src.stride + startX*numBands;
			iEnd = src.startIndex + y * src.stride + src.width*numBands;
			for ( int x = startX; i < iEnd; i += numBands, x++ ) {
				weightX -= kernelX.data[src.width-x+offsetX];
				int weight = weightX*weightY;
				Arrays.fill(total,0);
				int indexSrc = i - offsetY*src.stride;
				for (int k = 0; k < kernelWidthY; k++, indexSrc += src.stride) {
					int w = kernelY.data[k];
					for (int band = 0; band < numBands; band++) {
						total[band] += (dataSrc[indexSrc+band])*w;
					}
				}
				for (int band = 0; band < numBands; band++) {
					dataDst[indexDst++] = (short)((total[band]+weight/2)/weight);
				}
			}
		}
	}

}
